In [1]:
import pandas
from sklearn.feature_extraction.text import CountVectorizer

Set up a DataFrame of text documents


In [2]:
# Toy corpus: document id -> raw text
corpus_dict = {1: 'This is the first document.',
               2: 'This is the second second document.',
               3: 'And the third one.',
               4: 'Is this the first document?'}

# Build one (id, text) row per document. sorted() materialises the items as a
# real list (a dict view is not accepted by every pandas version) and fixes the
# row order by id regardless of the interpreter's dict ordering.
df = pandas.DataFrame(sorted(corpus_dict.items()), columns=['id', 'text'])
print(df.shape)


(4, 2)

In [3]:
# Plain-text dump of the full frame: one row per document with its id and text
print(df)


   id                                 text
0   1          This is the first document.
1   2  This is the second second document.
2   3                   And the third one.
3   4          Is this the first document?

In [4]:
# Extract every value of the 'text' column as a plain Python list
list_of_texts = df.loc[:, 'text'].tolist()
print(list_of_texts)


['This is the first document.', 'This is the second second document.', 'And the third one.', 'Is this the first document?']

Set up the scikit-learn vectorizer


In [5]:
# Token-count (bag-of-words) vectorizer; min_df=1 is passed explicitly here
vectorizer = CountVectorizer(min_df=1)
# Learn the vocabulary from the texts and build the count matrix in one call.
# NOTE(review): despite the name, the printed array has one row per document
# and one column per term (4 rows x 9 columns for this corpus).
term_doc_matrix = vectorizer.fit_transform(list_of_texts)

In [7]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement, so use whichever this
# installation provides. .tolist() keeps the result a plain list of strings.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
feature_names


Out[7]:
[u'and',
 u'document',
 u'first',
 u'is',
 u'one',
 u'second',
 u'the',
 u'third',
 u'this']

In [8]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement, so use whichever exists here.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# Vocabulary first, then the dense count matrix (one row per document,
# one column per vocabulary term, in the order printed above it)
print(feature_names)
print(term_doc_matrix.toarray())


[u'and', u'document', u'first', u'is', u'one', u'second', u'the', u'third', u'this']
[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]

In [9]:
# Put BoW vectors into a new df, one column per vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on old installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
df_bow = pandas.DataFrame(term_doc_matrix.toarray(), columns=feature_names)

In [10]:
# Plain-text dump of the bag-of-words frame: one row per document, one count column per term
print(df_bow)


   and  document  first  is  one  second  the  third  this
0    0         1      1   1    0       0    1      0     1
1    0         1      0   1    0       2    1      0     1
2    1         0      0   0    1       0    1      1     0
3    0         1      1   1    0       0    1      0     1

Merge the two DataFrames


In [11]:
# Important: the merged result must keep the row index of the first (text) DataFrame.
# concat()'s join_axes argument was deprecated in pandas 0.25 and removed in 1.0;
# reindexing the concatenated frame on df.index is the documented replacement.
result = pandas.concat([df, df_bow], axis=1).reindex(df.index)

In [12]:
# Show the merged frame: the id and text columns followed by one count column per term
result


Out[12]:
id text and document first is one second the third this
0 1 This is the first document. 0 1 1 1 0 0 1 0 1
1 2 This is the second second document. 0 1 0 1 0 2 1 0 1
2 3 And the third one. 1 0 0 0 1 0 1 1 0
3 4 Is this the first document? 0 1 1 1 0 0 1 0 1

In [ ]: